0.1 Contexte

This dataset was generously provided by Olist, the largest department store in Brazilian marketplaces. Olist connects small businesses from all over Brazil to channels without hassle and with a single contract. Those merchants are able to sell their products through the Olist Store and ship them directly to the customers using Olist logistics partners. See more on our website: www.olist.com.

The data set comes from kaggle competition named Brazilian E-Commerce Public Dataset by Olist.

0.2 Set and load packages for Python and R

# Set python environment and version in RStudio ;-)
reticulate::use_python("/usr/bin/python3.10", required = TRUE)
reticulate::py_config()
## python:         /usr/bin/python3.10
## libpython:      /usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so
## pythonhome:     //usr://usr
## version:        3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
## numpy:          /home/kirus/.local/lib/python3.10/site-packages/numpy
## numpy_version:  1.26.0
## 
## NOTE: Python version was forced by use_python() function
# Load needed R packages
library(tidyverse)
library(data.table)
library(leaflet)
library(stringr)
#require(maps)
library(ggplot2)
#require(mapview)
#require(scales)
#require(RColorBrewer)
require(lubridate)
#library(plotly)
#require(gganimate)
#require(gifski)
#library(caret)
library(DT)
library(kableExtra)

1 Load Datasets

customers <- fread("data/olist_customers_dataset.csv")
geolocation <- fread("data/olist_geolocation_dataset.csv")
order_items <- fread("data/olist_order_items_dataset.csv")
order_payments <- fread("data/olist_order_payments_dataset.csv")
order_reviews <- fread("data/olist_order_reviews_dataset.csv")
orders <- fread("data/olist_orders_dataset.csv")
products <- fread("data/olist_products_dataset.csv")
sellers <- fread("data/olist_sellers_dataset.csv")
category_translation <- fread("data/product_category_name_translation.csv")

1.1 Customers

customers <- customers |>  
   rename(zip_code_prefix = customer_zip_code_prefix)

customers |> head(1000) |>
datatable()

1.2 Geolocation

geolocation <- geolocation |> 
  rename(Longitude = geolocation_lng) |>
  rename(Latitude = geolocation_lat) |>
  rename(zip_code_prefix= geolocation_zip_code_prefix)

geolocation |> 
  head(1000) |>
  datatable()

1.3 Items

order_items |> 
  head(1000) |>
  datatable()

1.4 Payments

order_payments |>
  head(1000) |>
  datatable()

1.5 Reviews

order_reviews |> 
  head(1000) |>
  datatable()

1.6 Orders

orders |> 
  head(1000) |>
  datatable()

1.7 Categories

category_translation |> 
  head(1000) |>
  datatable()

1.8 Products

products |> 
  head(100) |>
  datatable()

1.9 Sellers

sellers <- sellers |>
          rename(zip_code_prefix = seller_zip_code_prefix)

sellers |> 
  head(1000) |>
  datatable()

2 Dataset Engineering

Compute:

\[Volume = product\_length\_cm * product\_height\_cm * product\_width\_cm / 1000\]

2.1 check for missing value in dimensions used to compute Volumes

## check for  missing value in specific columns
product_id_with_missed_data <-
products |>
select(product_id,product_category_name, product_weight_g,
       product_length_cm, product_height_cm, product_width_cm) |>
 filter_all(any_vars(is.na(.)))

product_id_with_missed_data |>
   knitr::kable() |>   kable_styling() |>
  scroll_box(width = "900px", height = "300px") 
product_id product_category_name product_weight_g product_length_cm product_height_cm product_width_cm
09ff539a621711667c43eba6a3bd8466 bebes NA NA NA NA
5eb564652db742ff8f28759cd8d2652a NA NA NA NA

2.2 Which sellers have product_id with missing data?

sellers_missing_data <-
order_items |>
  filter(product_id %in% product_id_with_missed_data$product_id) %>%
  group_by(product_id, seller_id, shipping_limit_date, price, freight_value) %>%
  count() %>%
  arrange(shipping_limit_date)

sellers_missing_data %>%
  datatable()

2.2.1 Trend of the price of products with missed informations

sellers_missing_data %>%
  #mutate(total= price+ freight_value) %>%
  pivot_longer(-c(product_id, seller_id,n,shipping_limit_date)) %>%
  ggplot(aes(x = shipping_limit_date, y= value, fill = name)) +
  geom_area()+
    facet_wrap(~product_id, scales = "free_y")

* Increase of the price during the period of study.

2.2.2 Compute the Volumes of Products

## add english name and weight range catagories

products <- products |>
            left_join(y = category_translation, by = "product_category_name") |>
            mutate(`volume dm³` = product_length_cm * product_height_cm * product_width_cm/1000) %>%
           rename(weight = product_weight_g) %>%
           mutate(`weight kg` = weight/1000)



products |> 
  head(1000)  |>
  datatable()

2.2.3 Mutate the Weight range of products

products <- products |>
    mutate(weight_range = if_else(`weight kg` <= 1,"0-1", 
                                if_else(`weight kg`> 1 &  `weight kg` <= 5, "1-5 kg",
                                        if_else(`weight kg` > 5 &`weight kg` <=10, "5-10 kg",
                                                if_else(`weight kg` > 10 & `weight kg` <= 20, "10-20 kg",
                                                        if_else(`weight kg` > 20 & `weight kg` <= 30, "20-30 kg",
                                                                if_else(`weight kg` > 30 & `weight kg` <= 42, "30-42 kg", NA )))))))

products |> 
  head(1000)  |>
  datatable()
## glimpse the density of products volumes
products |>
  drop_na(`volume dm³`, weight_range) |>
  ggplot() +
  aes(x=`volume dm³`, color= weight_range, fill= weight_range, na.rm = TRUE)+
  geom_density(adjust= 10, alpha=0.1)
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

  • The main volume of products are less than 5 dm³

2.2.4 Mutate Volume Range

## add volume range
products <- products |>
  mutate(volume_range = if_else(`volume dm³` <= 1,"0-1 dm³", 
                                if_else(`volume dm³`> 1 &  `volume dm³` <= 5, "1-5 dm³",
                                        if_else(`volume dm³` > 5 &`volume dm³` <=10, "5-10 dm³",
                                                if_else(`volume dm³` > 10 & `volume dm³` <= 20, "10-20 dm³",
                                                        if_else(`volume dm³` > 20 & `volume dm³` <= 30, "20-30 dm³",
                                                             if_else(`volume dm³` > 30 , ">30 dm3", NA ))))))) 


products |> head() |>
  datatable()
  #knitr::kable() |>   kable_styling() |>
  #scroll_box(width = "900px", height = "300px")

3 Explore Products

3.1 list categories and count same description lenght

## list categories
products %>%
  #group_by(product_category_name_english, product_name_lenght) |>
  #mutate(Qt=n()) |>
  pull(product_category_name_english) |> 
  unique() |> as.data.frame() |>
  rownames_to_column(var = "index") |> 
  knitr::kable() |>  kable_styling() |>
  scroll_box(width = "300px", height = "500px")
index unique(products %>% pull(product_category_name_english))
1 perfumery
2 art
3 sports_leisure
4 baby
5 housewares
6 musical_instruments
7 cool_stuff
8 furniture_decor
9 home_appliances
10 toys
11 bed_bath_table
12 construction_tools_safety
13 computers_accessories
14 health_beauty
15 luggage_accessories
16 garden_tools
17 office_furniture
18 auto
19 electronics
20 fashion_shoes
21 telephony
22 stationery
23 fashion_bags_accessories
24 computers
25 home_construction
26 watches_gifts
27 construction_tools_construction
28 pet_shop
29 small_appliances
30 agro_industry_and_commerce
31 NA
32 furniture_living_room
33 signaling_and_security
34 air_conditioning
35 consoles_games
36 books_general_interest
37 costruction_tools_tools
38 fashion_underwear_beach
39 fashion_male_clothing
40 kitchen_dining_laundry_garden_furniture
41 industry_commerce_and_business
42 fixed_telephony
43 construction_tools_lights
44 books_technical
45 home_appliances_2
46 party_supplies
47 drinks
48 market_place
49 la_cuisine
50 costruction_tools_garden
51 fashio_female_clothing
52 home_confort
53 audio
54 food_drink
55 music
56 food
57 tablets_printing_image
58 books_imported
59 small_appliances_home_oven_and_coffee
60 fashion_sport
61 christmas_supplies
62 fashion_childrens_clothes
63 dvds_blu_ray
64 arts_and_craftmanship
65 furniture_bedroom
66 cine_photo
67 diapers_and_hygiene
68 flowers
69 home_comfort_2
70 security_and_services
71 furniture_mattress_and_upholstery
72 cds_dvds_musicals

3.2 List product_id by categories

## count how many products by category and count how many products with the same description length
products %>%
  #filter(product_category_name== "cama_mesa_banho") |>
  #mutate(product_description_lenght= as.factor(product_description_lenght)) |>
  group_by(product_category_name_english, product_name_lenght, product_description_lenght) |>
  #mutate(Qt= n()) |>
  summarise(products_with_same_name_lenght = n(), .groups='keep') |>
  group_by(product_description_lenght) |>
  #filter(product_description_lenght=="93") |>
  mutate(products_with_same_description_lenght = n()) |>
    #head(1000) |>
  #arrange(desc(Qt)) |>
  #filter(product_category_name_english %in% c("baby", "housewares")) |>
 arrange(desc(products_with_same_name_lenght)) |>
  head(100) |>
  datatable()
  • There are 610 products without description

  • We assume that products from the same category and have different name lenght are different.

  • We assume that products with the same name lenght and different description lenght are same bu from different sellers.

3.3 check if products with the same name lenght and different description lenght are same but from different seller

group_pdt_sellers <- 
  order_items |>
  group_by(product_id, seller_id) |>
  summarise(Freq= n(), .groups="keep") |>
  arrange(desc(Freq))

group_pdt_sellers |>
  head(100) |>
  datatable()
products |>
  left_join(group_pdt_sellers[-3] , by = "product_id") |>
  select( product_category_name_english,product_name_lenght, seller_id, product_id) |>
  group_by(product_category_name_english, product_name_lenght, seller_id) |>
  mutate(products_with_same_name_lenght = n()) |>
  arrange(desc(products_with_same_name_lenght)) |>
 head(500) |>
  datatable()
  • We find which seller has missing product name: e5a3438891c0bfdb9394643f95273d8e. We can find his geolocation.

  • Several items with the same product length name exist for the same seller. It seems the same product but with different color or version or option.

  • It is possible that products belonging to different categories have the same name length and sold by different sellers.

CONCLUSION: The product_name_lenght is not specific to a specific product.

3.4 Display the ratio weight/volume of product

products |>
  drop_na(`weight kg`, `volume dm³`, `product_category_name_english`) |>
  ggplot() + 
  aes(y = `weight kg` , x = `product_category_name_english`, color= weight_range) +#, size=`volume dm³` 
  geom_point() +
    theme(legend.position = "left", #axis.text.y = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 14)
        ) +
  #facet_wrap(~weight_range, nrow = 6, scales = "free_y")+
  facet_wrap(~volume_range, nrow = 6, scales= "free_y")

* We can note some products with small volume but with high weigth and vis-versa.

4 Explore Orders

## join items and orders and products
detail_orders <- order_items |>
  left_join(orders, by = "order_id") |>
  left_join(products|> select(product_id, product_category_name_english), by = "product_id") |>
  select(order_id, order_item_id, product_id, product_category_name_english,seller_id, customer_id, everything()) 


detail_orders|>
  head(1000) |>
  datatable()

4.1 How many orders by customer

## group by order and check items
orders_customer <- detail_orders |>
  group_by( order_id, customer_id, product_category_name_english) |>
  summarise(n_items= n(), .groups = "keep") |>
  group_by( order_id, customer_id, product_category_name_english, n_items) |>
  summarise(n_orders= n(), .groups = "keep") |>
  arrange(desc(n_items)) 
  #arrange(desc(n_orders))

orders_customer |>
  head(1000) |>
  datatable()
  • The maximum number of items per orders is 21.
## check how many customers
n_customers <- detail_orders |>
  distinct(customer_id) |>
  count() |>
  pull()

## check How many orders

n_orders <- detail_orders |>
  distinct(order_id) |>
  count() |>
  pull()

print(paste0("there are ", n_customers, " Customers in this study"))
## [1] "there are 98666 Customers in this study"
print(paste0("there are ", n_orders, " Orders in this study"))
## [1] "there are 98666 Orders in this study"
  • This study shows only one orders per each customer
## visualize  categories that were purchased by customers with more than 6 items per order
orders_customer |>
  filter(n_items > 6) |>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= customer_id) +
  geom_col() +
  theme(legend.position = "bottom", axis.title.x = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
         
        plot.title = element_text(size = 14)
        )

orders_seller <- detail_orders |>
  group_by( order_id, seller_id, product_category_name_english) |>
  summarise(n_items= n(), .groups = "keep") |>
  group_by( order_id,seller_id, product_category_name_english, n_items) |>
  summarise(n_orders= n(), .groups = "keep") |>
  arrange(desc(n_items)) 
  #arrange(desc(n_orders))

orders_seller |>
  head(1000) |>
  datatable()
orders_seller |>
  filter(n_items > 6) |>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= seller_id) +
  geom_col() +
  theme(legend.position = "bottom", axis.title.x = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 14)
        )

5 Distinct geolocalisation of Sellers/Customers

5.1 What is the difference between customer_id and customer_unique_id?

## count customer_id
paste0("there is ",
customers |> 
  distinct(customer_id) |>
  nrow(),
"  Customer_id in this data")
## [1] "there is 99441  Customer_id in this data"
paste0("there is ",
customers |> distinct(customer_unique_id) |> nrow(),
  "  Customer_unique_id in this data")
## [1] "there is 96096  Customer_unique_id in this data"
paste0("There is about  ",
   customers |> distinct(customer_id) |> nrow() - customers |> distinct(customer_unique_id) |> nrow(),    
      " rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id " )
## [1] "There is about  3345 rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id "

5.2 Group Customer_unique_id and list paires with Custumer_id

## What is the difference between customer_id and customer_unique_id
list_of_unique_id_with_multiple_customer_id <- customers |>
                                        group_by(customer_unique_id) |>
                                        ## filter only duplicated
                                        filter(n()>1) |>
                                        arrange(desc(customer_unique_id))
paste0("Here is a sample of table with  ", 
list_of_unique_id_with_multiple_customer_id |> nrow(),
"  rows. ")
## [1] "Here is a sample of table with  6342  rows. "
list_of_unique_id_with_multiple_customer_id |>
  head(1000) |>
  datatable()
  • This table lists the customer_unique_id corresponding to multiple customer_id.

5.3 Extract Customer_unique_id and their zip_code_prefix

duplicated_customer_unique_id <-
  list_of_unique_id_with_multiple_customer_id |>
  distinct(customer_unique_id) |>
  pull()
  
zip_code_prefix_of_duplicated_customer_unique_id <-
  list_of_unique_id_with_multiple_customer_id |>
  distinct(zip_code_prefix) |>
  pull()

5.4 What about the same Zip code prefix for multiple Latitude/Longiture

geolocation |> 
  filter(zip_code_prefix %in% zip_code_prefix_of_duplicated_customer_unique_id) |>
  arrange(desc(zip_code_prefix)) |>
    head(1000) |>
    datatable()

5.5 If duplicates in customers_id equal the duplicates in Lat/lon for the same Zip code?

## Filter only Zip code for customer_unique_id
  customers |>
  filter(customer_unique_id %in% duplicated_customer_unique_id) |>
  group_by(customer_id, customer_unique_id, zip_code_prefix) |>
  #distinct(customer_unique_id) |>
  arrange(desc(customer_unique_id)) |>
  filter(zip_code_prefix == 99750) |>
  datatable()

NO!

Only zip_code_prefix is not enought to distinct between geolocations of customers and sellers

For example there is only one customer with the zip_code equal to 99750.

But in geolicalisation we have 14 Positions related to this zip_code

5.6 What about Seller_id

paste0("There are  ",
sellers |>
  group_by(seller_id) %>%
  filter(n()>1),
"Duplited seller_id.")
## [1] "There are  character(0)Duplited seller_id."
## [2] "There are  integer(0)Duplited seller_id."  
## [3] "There are  character(0)Duplited seller_id."
## [4] "There are  character(0)Duplited seller_id."
sellers %>%
  group_by(zip_code_prefix) %>%
  mutate(Freq= n()) %>%
  arrange(desc(Freq)) %>%
  datatable()

6 Explore Sellers localisation

sellers_geolocation <- geolocation |> 
  filter(zip_code_prefix %in% sellers$zip_code_prefix) %>% ## IS NOT ENOUTH there are the same zip code for multiple sellers and customers.
  left_join(y = sellers, by = "zip_code_prefix")

lng1 <- min(sellers_geolocation$Longitude)
lng2 <- max(sellers_geolocation$Longitude)

lat1 <- min(sellers_geolocation$Latitude)
lat2 <- max(sellers_geolocation$Latitude)
  
leaflet(sellers_geolocation) %>%
  addTiles() %>%
  setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
    addRectangles(
    lng1= lng1, lat1= lat1,
    lng2= lng2, lat2=lat2,
    fillColor = "transparent"
  ) %>%
    addMarkers(~Longitude, ~Latitude,
             popup = ~seller_city, label = ~seller_state,
             clusterOptions = markerClusterOptions())
customers_geolocation <- geolocation |> 
  #rename(zip_code_prefix= geolocation_zip_code_prefix) |>
  filter(zip_code_prefix %in% unique(customers$zip_code_prefix)) |> 
  left_join(y = customers, by = "zip_code_prefix")

lng1 <- min(customers_geolocation$Longitude)
lng2 <- max(customers_geolocation$Longitude)

lat1 <- min(customers_geolocation$Latitude)
lat2 <- max(customers_geolocation$Latitude)
  
# leaflet(customers_geolocation) %>%
#   addTiles() %>%
#   setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
#     addRectangles(
#     lng1= lng1, lat1= lat1,
#     lng2= lng2, lat2=lat2,
#     fillColor = "transparent"
#   ) %>%
#     addMarkers(~Longitude, ~Latitude,
#              popup = ~customer_city, label = ~customer_state,
#              clusterOptions = markerClusterOptions())